In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [1593]:
# Dropdown to pick which stock's CSV the rest of the notebook analyzes.
w = widgets.Dropdown(
    options=['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
             'FB', 'GME', 'MCD', 'PFE', 'PLUG',
             'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description='Stock name:',
)

def on_change(change):
    """Print the newly selected ticker whenever the dropdown value changes."""
    print("You have selected %s" % change['new'])

# names='value' makes ipywidgets deliver only value-change events, so the
# handler no longer has to filter change['type'] / change['name'] itself.
w.observe(on_change, names='value')

display(w)
You have selected MCD
In [1594]:
# Build the CSV path from the selected ticker instead of 15 copy-pasted
# if-blocks. Behavior is unchanged: df is only (re)assigned when a real
# ticker (anything but the 'SELECT' placeholder) has been chosen.
if w.value != 'SELECT':
    df = pd.read_csv(f'/content/Final_{w.value}.csv')
In [1595]:
# Render full (untruncated) cell contents in DataFrame displays.
pd.set_option('display.max_colwidth', None)
In [1596]:
# Parse the Date column to datetime64[ns]; pd.to_datetime is the idiomatic
# (and more forgiving) converter for string dates than astype.
df['Date'] = pd.to_datetime(df['Date'])
In [1597]:
# Drop the CSV's stale index column. errors='ignore' keeps this cell
# idempotent: `del df['Unnamed: 0']` raised KeyError when re-run.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
In [1598]:
df.head(5)
Out[1598]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2009-06-02 59.570000 60.770000 59.480000 60.380001 41.666481 9332000 0.885552 0.463308 1.138162 1.391112 60.860837 56.593449 58.727143 NaN 2.303929 1.290001 81.605162 NaN NaN NaN 6.180000 NaN 0.114022 77.063460 NaN NaN 94.289443 89.265645 -4.069393e+06 8.090678e+06 77431600.0 0.0 4.445900e+04 0.0 0.0 0.0 0.0 0.0 4.445900e+04 0.0 0.0 0.0 0.0 1 4 5 0 0 0 0 5
1 2009-06-03 59.849998 61.009998 59.750000 60.990002 42.087437 10424500 1.010269 0.774925 1.170082 1.372382 61.449122 57.122308 59.285715 NaN 2.250175 1.259998 82.644062 NaN NaN NaN 7.120003 NaN 0.132170 78.427263 NaN NaN 94.503974 93.204151 6.024225e+06 1.118332e+07 87856100.0 0.0 1.109121e+06 0.0 0.0 0.0 0.0 0.0 1.109121e+06 0.0 0.0 0.0 0.0 3 13 16 0 0 0 0 16
2 2009-06-04 60.650002 60.650002 59.799999 60.240002 41.913494 10236900 -1.229710 0.773326 1.231797 1.346327 61.705442 57.265988 59.485715 NaN 2.234939 1.190002 82.644062 NaN NaN NaN 3.990002 NaN 0.070933 72.703340 NaN NaN 88.226906 92.340108 6.385558e+06 1.154680e+07 77619200.0 0.0 6.275000e+04 0.0 0.0 0.0 0.0 0.0 6.275000e+04 0.0 0.0 0.0 0.0 0 4 4 0 0 0 0 4
3 2009-06-05 60.400002 60.490002 59.349998 59.869999 41.656044 10285700 -0.614214 0.986750 0.770755 1.316853 61.534425 58.022719 59.778572 NaN 2.199520 1.140003 59.634697 NaN NaN NaN 4.180000 NaN 0.075058 69.989477 NaN NaN 73.902661 85.544514 5.483285e+06 1.035870e+07 67333500.0 0.0 8.475000e+03 0.0 0.0 0.0 0.0 0.0 8.475000e+03 0.0 0.0 0.0 0.0 0 2 2 0 0 0 0 2
4 2009-06-08 58.580002 59.180000 57.750000 58.720001 40.855896 14756100 -1.920825 1.115790 0.535478 1.431588 61.326385 58.399331 59.862858 2.724103 2.437990 2.119999 4.844916 NaN NaN NaN 1.639999 4.850677 0.028732 62.216049 NaN NaN 50.672187 70.933918 1.074597e+07 1.060545e+07 52577400.0 0.0 2.509982e+06 0.0 0.0 0.0 0.0 0.0 2.509982e+06 0.0 0.0 0.0 0.0 15 10 25 0 0 0 0 25
In [1599]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3153 entries, 0 to 3152
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       3153 non-null   datetime64[ns]
 1   Open                       3153 non-null   float64       
 2   High                       3153 non-null   float64       
 3   Low                        3153 non-null   float64       
 4   Close                      3153 non-null   float64       
 5   Adj Close                  3153 non-null   float64       
 6   Volume                     3153 non-null   int64         
 7   Return                     3153 non-null   float64       
 8   Beta                       3153 non-null   float64       
 9   Variance                   3153 non-null   float64       
 10  AvgTrueRange               3153 non-null   float64       
 11  Upperband                  3153 non-null   float64       
 12  Lowerband                  3153 non-null   float64       
 13  Middleband                 3153 non-null   float64       
 14  APO                        3149 non-null   float64       
 15  NATR                       3153 non-null   float64       
 16  TRANGE                     3153 non-null   float64       
 17  DMI                        3153 non-null   float64       
 18  MACD                       3141 non-null   float64       
 19  MACDSIGNAL                 3141 non-null   float64       
 20  MACDHIST                   3141 non-null   float64       
 21  MOM                        3153 non-null   float64       
 22  PPO                        3149 non-null   float64       
 23  ROCP                       3153 non-null   float64       
 24  RSI                        3153 non-null   float64       
 25  TRIX                       3087 non-null   float64       
 26  ULTOSC                     3146 non-null   float64       
 27  SLOWK                      3153 non-null   float64       
 28  SLOWD                      3153 non-null   float64       
 29  AD                         3153 non-null   float64       
 30  ADOSC                      3153 non-null   float64       
 31  OBV                        3153 non-null   float64       
 32  Upward_momentum_created    3153 non-null   float64       
 33  Downward_momentum_created  3153 non-null   float64       
 34  B5_O_Um                    3153 non-null   float64       
 35  B5_C_Um                    3153 non-null   float64       
 36  B5_E_Um                    3153 non-null   float64       
 37  B5_A_Um                    3153 non-null   float64       
 38  B5_N_Um                    3153 non-null   float64       
 39  B5_O_Dm                    3153 non-null   float64       
 40  B5_C_Dm                    3153 non-null   float64       
 41  B5_E_Dm                    3153 non-null   float64       
 42  B5_A_Dm                    3153 non-null   float64       
 43  B5_N_Dm                    3153 non-null   float64       
 44  Verified_status_True       3153 non-null   int64         
 45  Verified_status_False      3153 non-null   int64         
 46  O                          3153 non-null   int64         
 47  C                          3153 non-null   int64         
 48  E                          3153 non-null   int64         
 49  A                          3153 non-null   int64         
 50  N                          3153 non-null   int64         
 51  Real_or_Fake_tweet         3153 non-null   int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 1.3 MB
In [1600]:
df.shape
Out[1600]:
(3153, 52)
In [1601]:
sns.set(font_scale=0.8)
In [1602]:
# Bump seaborn font sizes (use the "poster" context for even larger text).
sns.set_context("talk", font_scale=1.3)

# Time series of the selected stock's daily closing price.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18,8))
    sns.lineplot(x=df.Date, y=df.Close, color='blue', ax=ax)
    ax.set_title('Closing Price')
In [1603]:
# CALCULATE PRICE RETURNS AS DAILY PERCENTAGE CHANGE USING pct_change()
# The original chained .dropna() here, but assigning a Series back into df
# realigns on the index and reinserts the NaN for the first row anyway,
# so the .dropna() was a no-op — removed.
df['returns'] = 100 * df.Close.pct_change()
In [1604]:
# CALCULATE LOG RETURNS BASED ON ABOVE FORMULA
# log return r_t = ln(C_t / C_{t-1}); the first row becomes NaN because
# shift(1) has no prior close to divide by.
df['log_returns'] = np.log(df.Close/df.Close.shift(1))
In [1605]:
df.head()
Out[1605]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2009-06-02 59.570000 60.770000 59.480000 60.380001 41.666481 9332000 0.885552 0.463308 1.138162 1.391112 60.860837 56.593449 58.727143 NaN 2.303929 1.290001 81.605162 NaN NaN NaN 6.180000 NaN 0.114022 77.063460 NaN NaN 94.289443 89.265645 -4.069393e+06 8.090678e+06 77431600.0 0.0 4.445900e+04 0.0 0.0 0.0 0.0 0.0 4.445900e+04 0.0 0.0 0.0 0.0 1 4 5 0 0 0 0 5 NaN NaN
1 2009-06-03 59.849998 61.009998 59.750000 60.990002 42.087437 10424500 1.010269 0.774925 1.170082 1.372382 61.449122 57.122308 59.285715 NaN 2.250175 1.259998 82.644062 NaN NaN NaN 7.120003 NaN 0.132170 78.427263 NaN NaN 94.503974 93.204151 6.024225e+06 1.118332e+07 87856100.0 0.0 1.109121e+06 0.0 0.0 0.0 0.0 0.0 1.109121e+06 0.0 0.0 0.0 0.0 3 13 16 0 0 0 0 16 1.010269 0.010052
2 2009-06-04 60.650002 60.650002 59.799999 60.240002 41.913494 10236900 -1.229710 0.773326 1.231797 1.346327 61.705442 57.265988 59.485715 NaN 2.234939 1.190002 82.644062 NaN NaN NaN 3.990002 NaN 0.070933 72.703340 NaN NaN 88.226906 92.340108 6.385558e+06 1.154680e+07 77619200.0 0.0 6.275000e+04 0.0 0.0 0.0 0.0 0.0 6.275000e+04 0.0 0.0 0.0 0.0 0 4 4 0 0 0 0 4 -1.229710 -0.012373
3 2009-06-05 60.400002 60.490002 59.349998 59.869999 41.656044 10285700 -0.614214 0.986750 0.770755 1.316853 61.534425 58.022719 59.778572 NaN 2.199520 1.140003 59.634697 NaN NaN NaN 4.180000 NaN 0.075058 69.989477 NaN NaN 73.902661 85.544514 5.483285e+06 1.035870e+07 67333500.0 0.0 8.475000e+03 0.0 0.0 0.0 0.0 0.0 8.475000e+03 0.0 0.0 0.0 0.0 0 2 2 0 0 0 0 2 -0.614214 -0.006161
4 2009-06-08 58.580002 59.180000 57.750000 58.720001 40.855896 14756100 -1.920825 1.115790 0.535478 1.431588 61.326385 58.399331 59.862858 2.724103 2.437990 2.119999 4.844916 NaN NaN NaN 1.639999 4.850677 0.028732 62.216049 NaN NaN 50.672187 70.933918 1.074597e+07 1.060545e+07 52577400.0 0.0 2.509982e+06 0.0 0.0 0.0 0.0 0.0 2.509982e+06 0.0 0.0 0.0 0.0 15 10 25 0 0 0 0 25 -1.920825 -0.019395
In [1606]:
# Drop the first row: its returns/log_returns are NaN because pct_change
# and shift(1) have no prior observation to compare against.
df = df.dropna()
In [1607]:
# PLOT DISTRIBUTION PLOTS OF RETURNS & LOG RETURNS
# AND VISUALLY COMPARE THEM WITH THE STANDARD NORMAL DISTRIBUTION
# Layout: left column = raw time series, right column = histogram with a
# fitted normal curve overlaid (fit=stats.norm).
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

    axes[0][0].plot(df.returns, color='blue')
    axes[0][0].set_title('Returns')

    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; kept here
    # because its fit= normal-overlay has no single-call histplot equivalent.
    sns.distplot(df.returns, norm_hist=True, fit=stats.norm, color='blue',
                bins=50, ax=axes[0][1])
    axes[0][1].set_title('Returns')

    axes[1][0].plot(df.log_returns, color='green')
    axes[1][0].set_title('Log Returns')

    sns.distplot(df.log_returns, norm_hist=True, fit=stats.norm, color='green',
                bins=50, ax=axes[1][1])
    axes[1][1].set_title('Log Returns')
    plt.tight_layout()
    fig.show();
In [1608]:
# FUNCTION THAT CALCULATES REALIZED VOLATILITY FROM DAILY LOG RETURNS
def realized_volatility_daily(series_log_return):
    """
    Realized volatility over a window of daily log returns.

    Computed as sqrt(sum(r_i^2) / (n - 1)), i.e. the square root of the
    zero-mean sample variance of the log returns in the window.

    Parameters
    ----------
    series_log_return : array-like of float
        Log returns within the window (pandas Series or numpy array).

    Returns
    -------
    float
        Realized volatility, or NaN when fewer than two observations are
        available (the n - 1 denominator would otherwise divide by zero).
    """
    n = len(series_log_return)
    if n < 2:
        # Guard against the n - 1 = 0 denominator; rolling windows in this
        # notebook are >= 7, so callers are unaffected.
        return np.nan
    return np.sqrt(np.sum(np.square(series_log_return)) / (n - 1))
In [1609]:
intervals = [7, 30, 60, 180, 365]

# Realized-volatility series for each rolling-window length, assembled
# directly into a DataFrame keyed by the interval size (same result as
# filling a dict in a loop and converting afterwards).
vols_df = pd.DataFrame(
    {i: df.log_returns.rolling(window=i)
                      .apply(realized_volatility_daily).values
     for i in intervals},
    columns=intervals,
    index=df.index,
)
In [1610]:
# Switch the matplotlib theme for the volatility comparison plot.
plt.style.use(['fivethirtyeight'])

fig, ax = plt.subplots(figsize=(18,7))

for i in intervals:
    # De-emphasize the noisy 7-day series; draw the others at full weight.
    is_noisy = (i == 7)
    ax.plot(vols_df[i],
            label=f'{i}-Day Interval Realized Volatility',
            alpha=0.5 if is_noisy else 1.0,
            lw=1 if is_noisy else 2)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

plt.legend(loc='best', prop={'size': 14})
plt.show();
In [1611]:
INTERVAL_WINDOW = 30  # days of log returns per realized-volatility window
n_future = 7          # forecast horizon in rows (trading days)

# GET BACKWARD LOOKING REALIZED VOLATILITY
# vol_current at row t = realized volatility of the 30 log returns up to t
# (the model input / current state).
df['vol_current'] = df.log_returns.rolling(window=INTERVAL_WINDOW)\
                                   .apply(realized_volatility_daily)

# GET FORWARD LOOKING REALIZED VOLATILITY 
# shift(-n_future) pulls returns n_future rows forward, so the 30-day window
# at row t ends n_future days ahead — vol_future is the prediction target.
df['vol_future'] = df.log_returns.shift(-n_future)\
                                 .rolling(window=INTERVAL_WINDOW)\
                                 .apply(realized_volatility_daily)
In [1612]:
df.describe()
Out[1612]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3.087000e+03 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3.087000e+03 3.087000e+03 3.087000e+03 3087.0 3.087000e+03 3087.0 3087.0 3087.0 3087.0 3087.0 3.087000e+03 3087.0 3087.0 3087.0 3087.0 3087.000000 3087.000000 3087.000000 3087.0 3087.0 3087.0 3087.0 3087.000000 3087.000000 3087.000000 3058.000000 3051.000000
mean 131.984250 132.915740 131.069368 132.008678 116.698482 5.039562e+06 0.057566 0.754357 3.301995 1.991805 134.358097 129.256481 131.807289 0.449126 1.463744 1.998646 32.450604 0.446758 0.442564 0.004194 0.665296 0.344225 0.005524 54.208086 0.047440 51.408601 55.418598 55.409031 3.413349e+07 1.274834e+05 5.534145e+08 0.0 2.254863e+06 0.0 0.0 0.0 0.0 0.0 2.254863e+06 0.0 0.0 0.0 0.0 4.766764 69.863298 74.630062 0.0 0.0 0.0 0.0 74.630062 0.057283 0.000502 0.010428 0.010429
std 53.134139 53.563206 52.681888 53.123387 58.195973 2.700829e+06 1.190401 0.436954 15.158841 1.516414 54.205060 51.867377 52.985877 2.489437 0.765769 1.904328 21.191262 1.705423 1.584702 0.556799 5.130147 1.572726 0.032556 11.534948 0.099839 9.729560 24.454263 22.550256 4.064844e+07 3.499145e+06 2.502854e+08 0.0 4.949033e+06 0.0 0.0 0.0 0.0 0.0 4.949033e+06 0.0 0.0 0.0 0.0 10.226281 69.707588 77.120294 0.0 0.0 0.0 0.0 77.120294 1.190759 0.011895 0.006160 0.006167
min 54.099998 54.340000 53.880001 54.230000 38.064846 9.632000e+05 -15.875351 -1.235812 0.009706 0.633286 56.492178 53.392286 55.128571 -24.743270 0.713757 0.250000 0.021493 -17.588256 -13.691809 -6.540312 -69.720001 -13.686135 -0.336779 20.033009 -0.381169 19.665817 0.827827 3.214241 -7.760304e+07 -1.719809e+07 -1.920340e+07 0.0 1.600000e+01 0.0 0.0 0.0 0.0 0.0 1.600000e+01 0.0 0.0 0.0 0.0 0.000000 0.000000 1.000000 0.0 0.0 0.0 0.0 1.000000 -15.875351 -0.172871 0.003280 0.003280
25% 93.269997 93.820000 92.755001 93.240002 73.543133 3.215350e+06 -0.476608 0.493912 0.346494 1.072399 94.748260 91.368485 93.173573 -0.472917 1.101898 0.959999 14.715431 -0.309931 -0.255675 -0.201862 -1.220001 -0.406025 -0.010944 46.465921 -0.020536 44.983621 35.740164 37.455402 3.448097e+06 -1.796716e+06 3.308203e+08 0.0 4.982322e+05 0.0 0.0 0.0 0.0 0.0 4.982322e+05 0.0 0.0 0.0 0.0 1.000000 31.000000 33.000000 0.0 0.0 0.0 0.0 33.000000 -0.478001 -0.004791 0.007585 0.007582
50% 111.709999 112.349998 110.830002 111.260002 95.557869 4.431600e+06 0.075252 0.766876 0.813421 1.474209 113.117403 109.266895 111.965715 0.508205 1.319313 1.449997 30.070558 0.442623 0.441487 0.017684 0.809998 0.438565 0.007316 54.483394 0.053176 51.752956 57.930936 58.022557 4.116313e+07 2.425201e+05 5.635624e+08 0.0 9.626650e+05 0.0 0.0 0.0 0.0 0.0 9.626650e+05 0.0 0.0 0.0 0.0 2.000000 53.000000 56.000000 0.0 0.0 0.0 0.0 56.000000 0.075252 0.000752 0.009179 0.009173
75% 175.004997 176.595001 173.775002 175.129997 162.193283 6.109200e+06 0.600937 0.993481 2.224161 2.470715 180.204366 171.614563 175.028570 1.416794 1.605691 2.419998 47.962398 1.196784 1.118214 0.221888 2.740002 1.226771 0.024115 62.577164 0.122249 58.057533 76.528675 74.264318 6.674037e+07 2.137691e+06 8.015311e+08 0.0 2.037151e+06 0.0 0.0 0.0 0.0 0.0 2.037151e+06 0.0 0.0 0.0 0.0 5.000000 85.000000 88.000000 0.0 0.0 0.0 0.0 88.000000 0.601190 0.005994 0.011621 0.011629
max 265.559998 266.890015 264.089996 265.549988 265.549988 3.647160e+07 18.125448 2.761386 460.722560 17.786362 269.082909 260.789813 263.527139 13.018460 11.989368 29.500000 95.186611 4.978176 4.727066 4.368476 39.939987 7.768897 0.291320 91.603821 0.287018 81.418866 97.422844 97.123851 1.303757e+08 1.469641e+07 8.737332e+08 0.0 8.184269e+07 0.0 0.0 0.0 0.0 0.0 8.184269e+07 0.0 0.0 0.0 0.0 130.000000 844.000000 913.000000 0.0 0.0 0.0 0.0 913.000000 18.125448 0.166577 0.065226 0.065226
In [1613]:
# Clearer target-column name: Real_or_Fake_tweet -> Fake_news.
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [1614]:
# Impute remaining NaNs (leading rolling-window gaps) with column medians.
# NOTE(review): df.median() on a frame containing a datetime column relies on
# pandas skipping non-numeric columns — confirm on pandas upgrades; warnings
# are suppressed globally in the imports cell.
df = df.fillna(df.median())
In [1615]:
df.isna().sum()
Out[1615]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1616]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3087 entries, 66 to 3152
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       3087 non-null   datetime64[ns]
 1   Open                       3087 non-null   float64       
 2   High                       3087 non-null   float64       
 3   Low                        3087 non-null   float64       
 4   Close                      3087 non-null   float64       
 5   Adj Close                  3087 non-null   float64       
 6   Volume                     3087 non-null   int64         
 7   Return                     3087 non-null   float64       
 8   Beta                       3087 non-null   float64       
 9   Variance                   3087 non-null   float64       
 10  AvgTrueRange               3087 non-null   float64       
 11  Upperband                  3087 non-null   float64       
 12  Lowerband                  3087 non-null   float64       
 13  Middleband                 3087 non-null   float64       
 14  APO                        3087 non-null   float64       
 15  NATR                       3087 non-null   float64       
 16  TRANGE                     3087 non-null   float64       
 17  DMI                        3087 non-null   float64       
 18  MACD                       3087 non-null   float64       
 19  MACDSIGNAL                 3087 non-null   float64       
 20  MACDHIST                   3087 non-null   float64       
 21  MOM                        3087 non-null   float64       
 22  PPO                        3087 non-null   float64       
 23  ROCP                       3087 non-null   float64       
 24  RSI                        3087 non-null   float64       
 25  TRIX                       3087 non-null   float64       
 26  ULTOSC                     3087 non-null   float64       
 27  SLOWK                      3087 non-null   float64       
 28  SLOWD                      3087 non-null   float64       
 29  AD                         3087 non-null   float64       
 30  ADOSC                      3087 non-null   float64       
 31  OBV                        3087 non-null   float64       
 32  Upward_momentum_created    3087 non-null   float64       
 33  Downward_momentum_created  3087 non-null   float64       
 34  B5_O_Um                    3087 non-null   float64       
 35  B5_C_Um                    3087 non-null   float64       
 36  B5_E_Um                    3087 non-null   float64       
 37  B5_A_Um                    3087 non-null   float64       
 38  B5_N_Um                    3087 non-null   float64       
 39  B5_O_Dm                    3087 non-null   float64       
 40  B5_C_Dm                    3087 non-null   float64       
 41  B5_E_Dm                    3087 non-null   float64       
 42  B5_A_Dm                    3087 non-null   float64       
 43  B5_N_Dm                    3087 non-null   float64       
 44  Verified_status_True       3087 non-null   int64         
 45  Verified_status_False      3087 non-null   int64         
 46  O                          3087 non-null   int64         
 47  C                          3087 non-null   int64         
 48  E                          3087 non-null   int64         
 49  A                          3087 non-null   int64         
 50  N                          3087 non-null   int64         
 51  Fake_news                  3087 non-null   int64         
 52  returns                    3087 non-null   float64       
 53  log_returns                3087 non-null   float64       
 54  vol_current                3087 non-null   float64       
 55  vol_future                 3087 non-null   float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 1.3 MB
In [1617]:
df.shape
Out[1617]:
(3087, 56)
In [1618]:
df=df.dropna()
In [1619]:
df.dtypes
Out[1619]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1620]:
# matplotlib.pyplot and seaborn are already imported in the imports cell at
# the top of the notebook; the redundant re-imports were removed.
plt.figure(figsize=(40,15))
sns.heatmap(df.corr(), annot=True)
Out[1620]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f077460b8d0>
In [1621]:
# Per-column histograms to eyeball each daily feature's distribution.
df.hist(figsize=(20, 32), bins=70, xlabelsize=8, ylabelsize=8);
In [1622]:
# Features whose absolute correlation with AvgTrueRange exceeds 0.5, strongest first.
df_corr = df.corr()['AvgTrueRange']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
NATR            0.852028
TRANGE          0.829648
vol_future      0.782077
vol_current     0.734745
Upperband       0.651595
Variance        0.646961
Adj Close       0.630811
High            0.628366
Middleband      0.625705
Close           0.621156
Open            0.620511
Low             0.613023
Lowerband       0.597434
OBV             0.570040
Name: AvgTrueRange, dtype: float64
In [1623]:
# Features whose absolute correlation with NATR exceeds 0.5, strongest first.
df_corr = df.corr()['NATR']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 7 strongly correlated values with NATR :
NATR            1.000000
vol_future      0.889337
AvgTrueRange    0.852028
vol_current     0.826610
Variance        0.728425
TRANGE          0.726175
MACD           -0.525614
Name: NATR, dtype: float64
In [1624]:
# Features whose absolute correlation with TRANGE exceeds 0.5, strongest first.
df_corr = df.corr()['TRANGE']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with TRANGE:
TRANGE          1.000000
AvgTrueRange    0.829648
NATR            0.726175
Variance        0.628713
vol_future      0.621076
Upperband       0.530690
vol_current     0.529007
Middleband      0.506281
High            0.505848
Adj Close       0.503687
Name: TRANGE, dtype: float64
In [1625]:
# Features whose absolute correlation with Openness ('O') exceeds 0.5.
df_corr = df.corr()['O']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with Openness:
Fake_news                    1.000000
O                            1.000000
Verified_status_False        0.995384
Verified_status_True         0.756332
B5_O_Dm                      0.586492
Downward_momentum_created    0.586492
Name: O, dtype: float64
In [1626]:
# Features whose absolute correlation with conscientiousness ('C') exceeds 0.5.
df_corr = df.corr()['C']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: C, dtype: float64)
In [1627]:
# Features whose absolute correlation with Extraversion ('E') exceeds 0.5.
df_corr = df.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste label: this cell analyzes Extraversion, not conscientiousness.
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: E, dtype: float64)
In [1628]:
# Features whose absolute correlation with Agreeableness ('A') exceeds 0.5.
df_corr = df.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste label: this cell analyzes Agreeableness, not conscientiousness.
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [1629]:
# Features whose absolute correlation with Neuroticism ('N') exceeds 0.5.
df_corr = df.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste label: this cell analyzes Neuroticism, not conscientiousness.
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: N, dtype: float64)
In [1630]:
# Column inventory after feature engineering (price, TA indicators, Big-Five
# sentiment features, and the volatility columns).
df.columns
Out[1630]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [1631]:
# Features whose absolute correlation with B5_O_Um exceeds 0.5.
df_corr = df.corr()['B5_O_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1632]:
# Features whose absolute correlation with B5_C_Um exceeds 0.5.
df_corr = df.corr()['B5_C_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1633]:
# Features whose absolute correlation with B5_E_Um exceeds 0.5.
df_corr = df.corr()['B5_E_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1634]:
# Features whose absolute correlation with B5_A_Um exceeds 0.5.
df_corr = df.corr()['B5_A_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1635]:
# Features whose absolute correlation with B5_N_Um exceeds 0.5.
df_corr = df.corr()['B5_N_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [1636]:
# Features whose absolute correlation with B5_O_Dm exceeds 0.5.
df_corr = df.corr()['B5_O_Dm']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with B5_O_Dm:
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.722514
Fake_news                    0.586492
O                            0.586492
Verified_status_False        0.542865
Name: B5_O_Dm, dtype: float64
In [1637]:
# Features whose absolute correlation with B5_C_Dm exceeds 0.5.
df_corr = df.corr()['B5_C_Dm']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Dm:
Series([], Name: B5_C_Dm, dtype: float64)
In [1638]:
# Features whose absolute correlation with B5_E_Dm exceeds 0.5.
df_corr = df.corr()['B5_E_Dm']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Dm:
Series([], Name: B5_E_Dm, dtype: float64)
In [1639]:
# Features whose absolute correlation with B5_A_Dm exceeds 0.5.
df_corr = df.corr()['B5_A_Dm']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [1640]:
# Features whose absolute correlation with B5_N_Dm exceeds 0.5.
df_corr = df.corr()['B5_N_Dm']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Dm:
Series([], Name: B5_N_Dm, dtype: float64)
In [1641]:
# Features whose absolute correlation with the Fake_news indicator exceeds 0.5.
df_corr = df.corr()['Fake_news']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Label fixed: the column was renamed from 'Real_or_Fake_tweet' to 'Fake_news'
# earlier in the notebook; the printout should use the current name.
print("There are {} strongly correlated values with Fake_news :\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
O                            1.000000
Verified_status_False        0.995384
Verified_status_True         0.756332
B5_O_Dm                      0.586492
Downward_momentum_created    0.586492
Name: Fake_news, dtype: float64
In [1642]:
# Features whose absolute correlation with Downward_momentum_created exceeds 0.5.
df_corr = df.corr()['Downward_momentum_created']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with Downward_momentum_created :
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.722514
Fake_news                    0.586492
O                            0.586492
Verified_status_False        0.542865
Name: Downward_momentum_created, dtype: float64
In [1643]:
# Features whose absolute correlation with Upward_momentum_created exceeds 0.5.
df_corr = df.corr()['Upward_momentum_created']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [1644]:
# Features whose absolute correlation with Verified_status_True exceeds 0.5.
df_corr = df.corr()['Verified_status_True']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
Fake_news                    0.756332
O                            0.756332
B5_O_Dm                      0.722514
Downward_momentum_created    0.722514
Verified_status_False        0.690058
Name: Verified_status_True, dtype: float64
In [1645]:
# Features whose absolute correlation with Verified_status_False exceeds 0.5.
df_corr = df.corr()['Verified_status_False']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.995384
O                            0.995384
Verified_status_True         0.690058
B5_O_Dm                      0.542865
Downward_momentum_created    0.542865
Name: Verified_status_False, dtype: float64
In [1646]:
# Shrink seaborn fonts for the dense pairplots that follow.
sns.set(font_scale=0.8)
In [1647]:
# Pairplots of every feature against NATR, five features per figure so each
# panel stays readable.
for start in range(0, len(df.columns), 5):
    feature_chunk = df.columns[start:start + 5]
    sns.pairplot(data=df, x_vars=feature_chunk, y_vars=['NATR'])
In [1648]:
# Dtype check: per the output, every feature is numeric except the
# datetime64 'Date' column.
df.dtypes
Out[1648]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1649]:
# Missing-value audit: per-column NaN counts (all zero in the shown output).
df.isnull().sum()
Out[1649]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1650]:
# Zero-fill any NaNs in place. NOTE(review): the isnull() audit above already
# showed zero missing values, so this is defensive only.
df.fillna(0, inplace = True)
In [1651]:
# No-op after the zero-fill above: no NaNs can remain at this point.
df.dropna(inplace=True)
In [1652]:
# Keep fonts small for the annotated heatmap below.
sns.set(font_scale=0.8)
In [1653]:
# Heatmap restricted to strong correlations; 'Close' is dropped because it is
# nearly collinear with Open/High/Low/Adj Close.
corr = df.drop('Close', axis=1).corr()
plt.figure(figsize=(12, 10))

# Only cells with r >= 0.5 or r <= -0.4 are drawn; the rest become NaN and are
# left blank. NOTE(review): the thresholds are asymmetric — confirm intended.
strong_corr = corr[(corr >= 0.5) | (corr <= -0.4)]
sns.heatmap(strong_corr,
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
In [1654]:
# Summary statistics for every numeric column.
df.describe()
Out[1654]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3.087000e+03 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000 3.087000e+03 3.087000e+03 3.087000e+03 3087.0 3.087000e+03 3087.0 3087.0 3087.0 3087.0 3087.0 3.087000e+03 3087.0 3087.0 3087.0 3087.0 3087.000000 3087.000000 3087.000000 3087.0 3087.0 3087.0 3087.0 3087.000000 3087.000000 3087.000000 3087.000000 3087.000000
mean 131.984250 132.915740 131.069368 132.008678 116.698482 5.039562e+06 0.057566 0.754357 3.301995 1.991805 134.358097 129.256481 131.807289 0.449126 1.463744 1.998646 32.450604 0.446758 0.442564 0.004194 0.665296 0.344225 0.005524 54.208086 0.047440 51.408601 55.418598 55.409031 3.413349e+07 1.274834e+05 5.534145e+08 0.0 2.254863e+06 0.0 0.0 0.0 0.0 0.0 2.254863e+06 0.0 0.0 0.0 0.0 4.766764 69.863298 74.630062 0.0 0.0 0.0 0.0 74.630062 0.057283 0.000502 0.010417 0.010415
std 53.134139 53.563206 52.681888 53.123387 58.195973 2.700829e+06 1.190401 0.436954 15.158841 1.516414 54.205060 51.867377 52.985877 2.489437 0.765769 1.904328 21.191262 1.705423 1.584702 0.556799 5.130147 1.572726 0.032556 11.534948 0.099839 9.729560 24.454263 22.550256 4.064844e+07 3.499145e+06 2.502854e+08 0.0 4.949033e+06 0.0 0.0 0.0 0.0 0.0 4.949033e+06 0.0 0.0 0.0 0.0 10.226281 69.707588 77.120294 0.0 0.0 0.0 0.0 77.120294 1.190759 0.011895 0.006133 0.006133
min 54.099998 54.340000 53.880001 54.230000 38.064846 9.632000e+05 -15.875351 -1.235812 0.009706 0.633286 56.492178 53.392286 55.128571 -24.743270 0.713757 0.250000 0.021493 -17.588256 -13.691809 -6.540312 -69.720001 -13.686135 -0.336779 20.033009 -0.381169 19.665817 0.827827 3.214241 -7.760304e+07 -1.719809e+07 -1.920340e+07 0.0 1.600000e+01 0.0 0.0 0.0 0.0 0.0 1.600000e+01 0.0 0.0 0.0 0.0 0.000000 0.000000 1.000000 0.0 0.0 0.0 0.0 1.000000 -15.875351 -0.172871 0.003280 0.003280
25% 93.269997 93.820000 92.755001 93.240002 73.543133 3.215350e+06 -0.476608 0.493912 0.346494 1.072399 94.748260 91.368485 93.173573 -0.472917 1.101898 0.959999 14.715431 -0.309931 -0.255675 -0.201862 -1.220001 -0.406025 -0.010944 46.465921 -0.020536 44.983621 35.740164 37.455402 3.448097e+06 -1.796716e+06 3.308203e+08 0.0 4.982322e+05 0.0 0.0 0.0 0.0 0.0 4.982322e+05 0.0 0.0 0.0 0.0 1.000000 31.000000 33.000000 0.0 0.0 0.0 0.0 33.000000 -0.478001 -0.004791 0.007610 0.007610
50% 111.709999 112.349998 110.830002 111.260002 95.557869 4.431600e+06 0.075252 0.766876 0.813421 1.474209 113.117403 109.266895 111.965715 0.508205 1.319313 1.449997 30.070558 0.442623 0.441487 0.017684 0.809998 0.438565 0.007316 54.483394 0.053176 51.752956 57.930936 58.022557 4.116313e+07 2.425201e+05 5.635624e+08 0.0 9.626650e+05 0.0 0.0 0.0 0.0 0.0 9.626650e+05 0.0 0.0 0.0 0.0 2.000000 53.000000 56.000000 0.0 0.0 0.0 0.0 56.000000 0.075252 0.000752 0.009179 0.009173
75% 175.004997 176.595001 173.775002 175.129997 162.193283 6.109200e+06 0.600937 0.993481 2.224161 2.470715 180.204366 171.614563 175.028570 1.416794 1.605691 2.419998 47.962398 1.196784 1.118214 0.221888 2.740002 1.226771 0.024115 62.577164 0.122249 58.057533 76.528675 74.264318 6.674037e+07 2.137691e+06 8.015311e+08 0.0 2.037151e+06 0.0 0.0 0.0 0.0 0.0 2.037151e+06 0.0 0.0 0.0 0.0 5.000000 85.000000 88.000000 0.0 0.0 0.0 0.0 88.000000 0.601190 0.005994 0.011593 0.011593
max 265.559998 266.890015 264.089996 265.549988 265.549988 3.647160e+07 18.125448 2.761386 460.722560 17.786362 269.082909 260.789813 263.527139 13.018460 11.989368 29.500000 95.186611 4.978176 4.727066 4.368476 39.939987 7.768897 0.291320 91.603821 0.287018 81.418866 97.422844 97.123851 1.303757e+08 1.469641e+07 8.737332e+08 0.0 8.184269e+07 0.0 0.0 0.0 0.0 0.0 8.184269e+07 0.0 0.0 0.0 0.0 130.000000 844.000000 913.000000 0.0 0.0 0.0 0.0 913.000000 18.125448 0.166577 0.065226 0.065226
In [1655]:
# DROPPING ALL NaN VALUES — defensive no-op: NaNs were already filled above.
df.dropna(inplace=True)
In [1656]:
# Number of most-recent days shown in the zoomed (lower) panel.
n_zoom = 365
sns.set_context("talk", font_scale=1.3)
# plt.style.use(['seaborn'])

# VISUALIZE REALIZED CURRENT VS. FUTURE VOLATILITY
# Top panel: full history of current vs. future realized volatility.
# Bottom panel: the last `n_zoom` days only.
# NOTE(review): `n_future` and `INTERVAL_WINDOW` are defined in an earlier
# cell not shown here — confirm they exist before a fresh Run-All.
with sns.axes_style("whitegrid"):
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    ax1.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
            label='Current Volatility')
    ax1.plot(df.vol_future, lw=1, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax2.plot(df.vol_current[-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
            label='Current Volatility')
    ax2.plot(df.vol_future[-n_zoom:], lw=2, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax2.title.set_text(f'Zooming in the Last {n_zoom} Days')

    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()
    
    plt.show();

Daily Volatility Distribution

In [1657]:
# Distribution of realized daily volatility with a fitted normal overlay.
# NOTE(review): sns.distplot is deprecated in newer seaborn; fine for the
# pinned Colab environment used here.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df.vol_current, bins=50, norm_hist=True, fit=stats.norm,
                ax=ax)
    ax.set_title('Daily Volatility Distribution')

    plt.show();

Experiment 2: weekly granularity

In [1659]:
# Ticker selection widget. NOTE(review): downstream results depend on this
# interactive choice — re-select the same ticker to reproduce a run.
w = widgets.Dropdown(
    options=['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
             'FB', 'GME', 'MCD', 'PFE', 'PLUG',
             'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description='Stock name:',
)


def on_change(change):
    """Echo the newly chosen ticker whenever the dropdown value changes."""
    value_changed = change['type'] == 'change' and change['name'] == 'value'
    if value_changed:
        print("You have selected %s" % change['new'])


w.observe(on_change)
display(w)
You have selected MCD
In [1660]:
# Load the feature CSV for the selected ticker.
# The original cell repeated the identical read 15 times in an if-chain; the
# file path is derived from the ticker instead. Behavior is unchanged: the
# 'SELECT' placeholder (or any unlisted value) leaves `df` untouched.
_TICKERS = ('AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI', 'FB', 'GME', 'MCD',
            'PFE', 'PLUG', 'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU')
if w.value in _TICKERS:
    df = pd.read_csv('/content/Final_{}.csv'.format(w.value),
                     parse_dates=['Date'], index_col=['Date'])
In [1661]:
# Raw column inventory for the freshly loaded CSV (still has 'Unnamed: 0'
# and the original 'Real_or_Fake_tweet' label column).
df.columns
Out[1661]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [1662]:
# Row/column count of the raw frame.
df.shape
Out[1662]:
(3153, 52)
In [1663]:
# Missing-value audit: several TA indicators (MACD, TRIX, ...) have leading
# NaNs from their warm-up windows, per the output below.
df.isnull().sum()
Out[1663]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           4
NATR                          0
TRANGE                        0
DMI                           0
MACD                         12
MACDSIGNAL                   12
MACDHIST                     12
MOM                           0
PPO                           4
ROCP                          0
RSI                           0
TRIX                         66
ULTOSC                        7
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [1664]:
# Median-impute the indicator warm-up NaNs, drop the leftover CSV index
# column, and give the label column its short name.
df = df.fillna(df.median())
df = df.drop(columns=['Unnamed: 0'])
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [1665]:
# Downsample to weekly granularity, averaging each week's daily rows.
df_weekly = df.resample('W').mean()
In [1666]:
# Sanity check: weekly frame size after resampling.
df_weekly.shape
Out[1666]:
(655, 51)
In [1667]:
# Annotated heatmap of the weekly feature correlation matrix.
plt.figure(figsize=(40, 15))
sns.heatmap(
    df_weekly.corr(),
    annot=True,
)
Out[1667]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f077d542210>
In [1668]:
# Shrink fonts again for the dense weekly plots below.
sns.set(font_scale=0.8)
In [1669]:
# Per-column histograms of the weekly-averaged features.
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
In [1670]:
# Weekly features whose absolute correlation with AvgTrueRange exceeds 0.5.
df_corr = df_weekly.corr()['AvgTrueRange']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 13 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
TRANGE          0.929753
NATR            0.838928
Variance        0.686690
Upperband       0.658002
Adj Close       0.639181
High            0.636169
Middleband      0.633454
Close           0.629491
Open            0.628959
Low             0.621985
Lowerband       0.606844
OBV             0.574528
Name: AvgTrueRange, dtype: float64
In [1671]:
# Weekly features whose absolute correlation with NATR exceeds 0.5.
df_corr = df_weekly.corr()['NATR']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 5 strongly correlated values with NATR :
NATR            1.000000
AvgTrueRange    0.838928
TRANGE          0.790817
Variance        0.766414
MACD           -0.521850
Name: NATR, dtype: float64
In [1672]:
# Weekly features whose absolute correlation with TRANGE exceeds 0.5.
df_corr = df_weekly.corr()['TRANGE']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 13 strongly correlated values with TRANGE:
TRANGE          1.000000
AvgTrueRange    0.929753
NATR            0.790817
Variance        0.761612
Upperband       0.612472
Middleband      0.586001
Adj Close       0.585576
High            0.585058
Open            0.577148
Close           0.576980
Low             0.568406
Lowerband       0.557454
OBV             0.526129
Name: TRANGE, dtype: float64
In [1673]:
# Weekly features whose absolute correlation with Openness ('O') exceeds 0.5.
df_corr = df_weekly.corr()['O']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with Openness:
Fake_news                    1.000000
O                            1.000000
Verified_status_False        0.997392
Verified_status_True         0.709465
B5_O_Dm                      0.566073
Downward_momentum_created    0.566073
Name: O, dtype: float64
In [1674]:
# Weekly features whose absolute correlation with conscientiousness ('C')
# exceeds 0.5.
df_corr = df_weekly.corr()['C']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: C, dtype: float64)
In [1675]:
# Weekly features whose absolute correlation with Extraversion ('E') exceeds 0.5.
df_corr = df_weekly.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste label: this cell analyzes Extraversion, not conscientiousness.
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: E, dtype: float64)
In [1676]:
# Weekly features whose absolute correlation with Agreeableness ('A') exceeds 0.5.
df_corr = df_weekly.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste label: this cell analyzes Agreeableness, not conscientiousness.
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [1677]:
# Weekly features whose absolute correlation with Neuroticism ('N') exceeds 0.5.
df_corr = df_weekly.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste label: this cell analyzes Neuroticism, not conscientiousness.
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: N, dtype: float64)
In [1678]:
# Weekly features whose absolute correlation with B5_O_Um exceeds 0.5.
df_corr = df_weekly.corr()['B5_O_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1679]:
# Weekly features whose absolute correlation with B5_C_Um exceeds 0.5.
df_corr = df_weekly.corr()['B5_C_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1680]:
# Weekly features whose absolute correlation with B5_E_Um exceeds 0.5.
df_corr = df_weekly.corr()['B5_E_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1681]:
# Weekly features whose absolute correlation with B5_A_Um exceeds 0.5.
df_corr = df_weekly.corr()['B5_A_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1682]:
# Correlations of all weekly features against 'B5_N_Um' (upward-momentum, N trait).
df_corr = df_weekly.corr()['B5_N_Um']
# Retain only features whose |correlation| exceeds 0.5, strongest first.
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [1683]:
# Correlations of all weekly features against 'B5_O_Dm' (downward-momentum, O trait).
df_corr = df_weekly.corr()['B5_O_Dm']
# Retain only features whose |correlation| exceeds 0.5, strongest first.
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with B5_O_Dm:
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.786379
Fake_news                    0.566073
O                            0.566073
Verified_status_False        0.525187
Name: B5_O_Dm, dtype: float64
In [1684]:
# Correlations of all weekly features against 'B5_C_Dm' (downward-momentum, C trait).
df_corr = df_weekly.corr()['B5_C_Dm']
# Retain only features whose |correlation| exceeds 0.5, strongest first.
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Dm:
Series([], Name: B5_C_Dm, dtype: float64)
In [1685]:
# Correlations of all weekly features against 'B5_E_Dm' (downward-momentum, E trait).
df_corr = df_weekly.corr()['B5_E_Dm']
# Retain only features whose |correlation| exceeds 0.5, strongest first.
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Dm:
Series([], Name: B5_E_Dm, dtype: float64)
In [1686]:
# Correlations of all weekly features against 'B5_A_Dm' (downward-momentum, A trait).
df_corr = df_weekly.corr()['B5_A_Dm']
# Retain only features whose |correlation| exceeds 0.5, strongest first.
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [1687]:
# Correlations of all weekly features against 'B5_N_Dm' (downward-momentum, N trait).
df_corr = df_weekly.corr()['B5_N_Dm']
# Retain only features whose |correlation| exceeds 0.5, strongest first.
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Dm:
Series([], Name: B5_N_Dm, dtype: float64)
In [1688]:
# Pearson correlations of every weekly feature against the 'Fake_news' column.
df_corr = df_weekly.corr()['Fake_news']
# Keep only the strongly correlated features (|r| > 0.5), strongest first.
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste bug: the message previously said "Real_or_Fake_tweet"
# although the column inspected here is 'Fake_news'.
print("There are {} strongly correlated values with Fake_news:\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
O                            1.000000
Verified_status_False        0.997392
Verified_status_True         0.709465
B5_O_Dm                      0.566073
Downward_momentum_created    0.566073
Name: Fake_news, dtype: float64
In [1689]:
# Correlations of all weekly features against 'Downward_momentum_created'.
df_corr = df_weekly.corr()['Downward_momentum_created']
# Retain only features whose |correlation| exceeds 0.5, strongest first.
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with Downward_momentum_created :
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.786379
Fake_news                    0.566073
O                            0.566073
Verified_status_False        0.525187
Name: Downward_momentum_created, dtype: float64
In [1690]:
# Correlations of all weekly features against 'Upward_momentum_created'.
df_corr = df_weekly.corr()['Upward_momentum_created']
# Retain only features whose |correlation| exceeds 0.5, strongest first.
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [1691]:
# Correlations of all weekly features against 'Verified_status_True'.
df_corr = df_weekly.corr()['Verified_status_True']
# Retain only features whose |correlation| exceeds 0.5, strongest first.
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
B5_O_Dm                      0.786379
Downward_momentum_created    0.786379
Fake_news                    0.709465
O                            0.709465
Verified_status_False        0.656746
Name: Verified_status_True, dtype: float64
In [1692]:
# Correlations of all weekly features against 'Verified_status_False'.
df_corr = df_weekly.corr()['Verified_status_False']
# Retain only features whose |correlation| exceeds 0.5, strongest first.
strong_mask = df_corr.abs() > 0.5
golden_features_list = df_corr.loc[strong_mask].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.997392
O                            0.997392
Verified_status_True         0.656746
B5_O_Dm                      0.525187
Downward_momentum_created    0.525187
Name: Verified_status_False, dtype: float64
In [1693]:
# Shrink seaborn fonts globally; the pairplot grids in the next cell have
# many small panels that would otherwise overlap their labels.
sns.set(font_scale=0.8)
In [1694]:
# Scatter every weekly feature against volatility (NATR), five features
# per figure so each panel stays readable.
n_features = len(df_weekly.columns)
for start in range(0, n_features, 5):
    x_cols = df_weekly.columns[start:start + 5]
    sns.pairplot(data=df_weekly, x_vars=x_cols, y_vars=['NATR'])
In [1695]:
# Replace every missing weekly value with 0.
# NOTE(review): inplace=True mutates df_weekly, so DataFrame views shown by
# earlier cells are now stale; a reassignment form would be safer on re-run.
df_weekly.fillna(0, inplace = True)
In [1696]:
# NOTE(review): this is effectively a no-op — the fillna(0) cell above has
# already replaced every NaN, so there are no rows left to drop. Kept for
# safety if the fillna cell is ever skipped.
df_weekly.dropna(inplace=True)
In [1697]:
# Correlation matrix of all weekly features, excluding the raw Close price.
corr = df_weekly.drop('Close', axis=1).corr() 
plt.figure(figsize=(12, 10))

# Annotate only the strongly correlated cells.
# NOTE(review): the thresholds are asymmetric (>= 0.5 positive vs <= -0.4
# negative) — confirm this is intentional and not a typo for -0.5.
sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)], 
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

Weekly volatility distribution

In [1698]:
# Histogram of weekly volatility (NATR) with a fitted normal curve overlaid.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; works on
    # this environment, but histplot/displot is the modern replacement.
    sns.distplot(df_weekly.NATR, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Weekly Volatility Distribution')
    
    plt.show();